{
 "cells": [
  {
   "cell_type": "markdown",
   "id": "2c86c7be",
   "metadata": {},
   "source": [
    "# Chapter 5 - Person Matching"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "86dd1e56",
   "metadata": {},
   "outputs": [],
   "source": [
    "import requests\n",
    "import json\n",
    "import zipfile\n",
    "import io\n",
    "import pandas as pd\n",
    "import numpy as np\n",
    "from bs4 import BeautifulSoup"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "b476eeef",
   "metadata": {},
   "source": [
    "## Step 1 - Data Acquisition"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "cf81046d",
   "metadata": {},
   "source": [
    "### Wikipedia"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "bcbd8f6a",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Get the Wikipedia webpage and find all tables.\n",
    "\n",
    "url = \"https://en.wikipedia.org/wiki/List_of_MPs_elected_in_the_2019_United_Kingdom_general_election\"\n",
    "\n",
    "website_page = requests.get(url).text\n",
    "soup = BeautifulSoup(website_page,'html.parser')\n",
    "tables = soup.find_all('table')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "647b6d9b",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Find the table with members returned, extract rows as a list of lists and load into dataframe\n",
    "\n",
    "for table in tables:\n",
    "    if 'Member returned' in table.text:\n",
    "        headers = [header.text.strip() for header in table.find_all('th')]\n",
    "        headers = headers[:5]\n",
    "        dfrows = []\n",
    "        table_rows = table.find_all('tr')    \n",
    "        for row in table_rows:\n",
    "            td = row.find_all('td')\n",
    "            dfrow = [row.text for row in td if row.text!='\\n']\n",
    "            tdlink = row.find_all(\"td\", {\"data-sort-value\" : True})\n",
    "            for element in tdlink:\n",
    "                for link in element.select(\"a[title]\"):\n",
    "                    urltail = link['href']\n",
    "                    url = f'https://en.wikipedia.org{urltail}'         \n",
    "            dfrow.append(url)\n",
    "            dfrows.append(dfrow)\n",
    "        headers.append('Wikilink')\n",
    "df_w = pd.DataFrame(dfrows, columns=headers)\n",
    "df_w = df_w.dropna()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "9bee4747-d86c-4422-b2b8-99d9322427e2",
   "metadata": {},
   "outputs": [],
   "source": [
    "df_w.info()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "838c7b4c",
   "metadata": {},
   "outputs": [],
   "source": [
    "def get_bday(url):\n",
    "    wiki_page = requests.get(url).text\n",
    "    soup = BeautifulSoup(wiki_page,'html.parser')\n",
    "    bday = ''\n",
    "    bdayelement = soup.select_one(\"span[class='bday']\")\n",
    "    if bdayelement is not None:\n",
    "        bday = bdayelement.text\n",
    "    return(bday)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "96bb6e73",
   "metadata": {},
   "outputs": [],
   "source": [
    "df_w['Birthday'] = df_w.apply(lambda x: get_bday(x.Wikilink), axis=1)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "5fc7bf69",
   "metadata": {},
   "source": [
    "### Companies House Persons of Significant Control"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "7309236b",
   "metadata": {},
   "outputs": [],
   "source": [
    "# UK Companies House Persons with Significant Control Download Page\n",
    "\n",
    "url = \"http://download.companieshouse.gov.uk/en_pscdata.html\""
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "c568806e",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Download snapshots, convert json to dataframe\n",
    "# Ensure all necessary columns are present in each snapshot dataframe\n",
    "# Drop rows if any of: company number, surname, forename, month of birth or year of birth are missing\n",
    "# Append to a single dataframe\n",
    "\n",
    "df_psctotal = pd.DataFrame()\n",
    "with requests.Session() as req:\n",
    "        r = req.get(url)\n",
    "        soup = BeautifulSoup(r.content, 'html.parser')\n",
    "        snapshots = [f\"{url[:38]}{item['href']}\" for item in soup.select(\n",
    "              \"a[href*='psc-snapshot']\")]\n",
    "        for snapshot in snapshots:   \n",
    "            print(snapshot)\n",
    "            response = requests.get(snapshot).content     \n",
    "            zipsnapshot = zipfile.ZipFile(io.BytesIO(response))         \n",
    "            tempfile = zipsnapshot.extract(zipsnapshot.namelist()[0])\n",
    "            df_psc = pd.json_normalize(pd.Series(open(tempfile, encoding=\"utf8\").readlines()).apply(json.loads))\n",
    "                \n",
    "            must_cols = ['company_number','data.name_elements.surname','data.name_elements.middle_name','data.name_elements.forename','data.date_of_birth.month','data.date_of_birth.year','data.name_elements.title','data.nationality']\n",
    "            all_cols =list(set(df_psc.columns).union(must_cols))\n",
    "            df_psc=df_psc.reindex(columns=sorted(all_cols))\n",
    "              \n",
    "            df_psc = df_psc.dropna(subset=['company_number','data.name_elements.surname','data.name_elements.forename','data.date_of_birth.month','data.date_of_birth.year'])       \n",
    "            df_psc = df_psc[must_cols] \n",
    "            df_psctotal = pd.concat([df_psctotal, df_psc], ignore_index=True)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "92e22091",
   "metadata": {},
   "source": [
    "### Saving to Local Storage"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "237e9ddb",
   "metadata": {},
   "outputs": [],
   "source": [
    "df_w.to_csv('mps_wiki_bday_raw.csv', index=False)\n",
    "df_psctotal.to_csv('psc_raw.csv', index=False)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "d4863f4d",
   "metadata": {},
   "source": [
    "## Step 2 - Data Standardization"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "0696e791",
   "metadata": {},
   "outputs": [],
   "source": [
    "df_w = pd.read_csv('mps_wiki_bday_raw.csv')\n",
    "df_psc = pd.read_csv('psc_raw.csv',dtype={'data.name_elements.surname':'string',\n",
    "                                           'data.name_elements.forename':'string',\n",
    "                                           'data.name_elements.middle_name':'string',\n",
    "                                           'data.name_elements.title':'string',\n",
    "                                           'data.nationality':'string'})"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "141137ec-9cd4-4b40-a70c-122e3122e85b",
   "metadata": {},
   "outputs": [],
   "source": [
    "df_w = pd.read_csv('mps_wiki_bday_raw.csv')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "cae788d6-d455-4233-9551-4a755675340d",
   "metadata": {},
   "outputs": [],
   "source": [
    "df_w.head(n=5)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "dc69204a",
   "metadata": {},
   "source": [
    "### Wikipedia"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "5a93c431",
   "metadata": {},
   "outputs": [],
   "source": [
    "df_w = df_w.dropna()\n",
    "df_w['Year'] = pd.to_datetime(df_w['Birthday']).dt.year.astype('int64')\n",
    "df_w['Month'] = pd.to_datetime(df_w['Birthday']).dt.month.astype('int64')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "fd64b603",
   "metadata": {},
   "outputs": [],
   "source": [
    "df_w = df_w.rename(columns={ 'Member returned' : 'Fullname'})\n",
    "df_w['Fullname'] = df_w['Fullname'].str.rstrip(\"\\n\")\n",
    "df_w['Fullname'] = df_w['Fullname'].str.lstrip(\"\\n\")\n",
    "df_w['Firstname'] = df_w['Fullname'].str.split().str[0]\n",
    "df_w['Lastname'] = df_w['Fullname'].astype(str).apply(lambda x: ' '.join(x.split()[1:]))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "cde97939",
   "metadata": {},
   "outputs": [],
   "source": [
    "df_w['unique_id'] = df_w.index\n",
    "df_w[\"company_number\"] = np.nan\n",
    "df_w=df_w[['Firstname','Lastname','Month','Year','unique_id','company_number']]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "b7227505",
   "metadata": {},
   "outputs": [],
   "source": [
    "df_w.head(n=5)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "db6f250f",
   "metadata": {},
   "source": [
    "### Companies House Persons of Significant Control"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "398fff11",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Rename and convert columns for matching\n",
    "\n",
    "df_psc = df_psc.dropna(subset=['data.date_of_birth.year', 'data.date_of_birth.month'])\n",
    "\n",
    "df_psc['Year'] = df_psc['data.date_of_birth.year'].astype('int64')\n",
    "df_psc['Month'] = df_psc['data.date_of_birth.month'].astype('int64')\n",
    "df_psc['Firstname']=df_psc['data.name_elements.forename']\n",
    "df_psc['Lastname']=df_psc['data.name_elements.surname']\n",
    "\n",
    "# Create unique index column needed by Splink from dataframe index\n",
    "# Subset down to required columns\n",
    "\n",
    "df_psc['unique_id'] = df_psc.index\n",
    "df_psc = df_psc[['Lastname','Firstname','company_number','Year','Month','unique_id']]"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "dc0c679a",
   "metadata": {},
   "source": [
    "### Saving to Local Storage"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "6946d68e",
   "metadata": {},
   "outputs": [],
   "source": [
    "#df_w.to_csv('mps_wiki_bday_clean.csv', index=False)\n",
    "df_w = pd.read_csv('mps_wiki_bday_clean.csv')\n",
    "df_psc.to_csv('psc_clean.csv', index=False)\n",
    "df_psc = pd.read_csv('psc_clean.csv')"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "d2285360",
   "metadata": {},
   "source": [
    "# Step 3 - Record Blocking and Attribute Comparison"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "e594a0bc",
   "metadata": {},
   "outputs": [],
   "source": [
    "len(df_psc)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "ac5b4420",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Count of simple merge on matching Year and Month \n",
    "\n",
    "df_mp = df_w.merge(df_psc, on=['Year','Month'], suffixes=('_w', '_psc'))\n",
    "len(df_mp)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "854e6cf4",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Calculate exact match using a simple join\n",
    "\n",
    "df_result = df_w.merge(df_psc, on=['Lastname','Firstname','Year','Month'], suffixes=('_w', '_psc'))\n",
    "df_result"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "1a21f488",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Splink settings to block on year and month matches and then compare First and Last names\n",
    "\n",
    "from splink.duckdb.linker import DuckDBLinker\n",
    "from splink.duckdb import comparison_library as cl\n",
    "settings = {\n",
    "    \"link_type\": \"link_only\",\n",
    "    \"blocking_rules_to_generate_predictions\": [\n",
    "        \"l.Year = r.Year and l.Month = r.Month\"\n",
    "    ],\n",
    "    \"comparisons\": [\n",
    "        cl.jaro_winkler_at_thresholds(\"Firstname\", [0.9]),\n",
    "        cl.jaro_winkler_at_thresholds(\"Lastname\", [0.9]),\n",
    "        cl.exact_match(\"Month\"),\n",
    "        cl.exact_match(\"Year\", term_frequency_adjustments=True),\n",
    "    ],\n",
    "    \"additional_columns_to_retain\": [\"company_number\"]\n",
    "}"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "2598620d",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Setup linker and profile columns\n",
    "\n",
    "linker = DuckDBLinker([df_w, df_psc], settings, input_table_aliases=[\"df_w\", \"df_psc\"])\n",
    "linker.profile_columns([\"Firstname\",\"Lastname\",\"Month\",\"Year\"], top_n=10, bottom_n=5)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "467b9ff3",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Estimate u values\n",
    "\n",
    "linker.estimate_u_using_random_sampling(max_pairs=1e7)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "436a7a98",
   "metadata": {},
   "outputs": [],
   "source": [
    "## Calculate m values \n",
    "\n",
    "linker.estimate_parameters_using_expectation_maximisation(\"l.Lastname = r.Lastname and l.Month = r.Month\", fix_u_probabilities=False)\n",
    "linker.estimate_parameters_using_expectation_maximisation(\"l.Firstname = r.Firstname and  l.Year = r.Year\", fix_u_probabilities=False)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "0a3c51e8",
   "metadata": {},
   "outputs": [],
   "source": [
    "#linker.save_model_to_json(\"Chapter5_Splink_Settings.json\", overwrite=True)\n",
    "linker.load_settings(\"Chapter5_Splink_Settings.json\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "39bbc0bf",
   "metadata": {},
   "outputs": [],
   "source": [
    "linker.match_weights_chart()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "fef8ea8e",
   "metadata": {},
   "outputs": [],
   "source": [
    "linker.m_u_parameters_chart()"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "3bad48fa",
   "metadata": {},
   "source": [
    "# Step 4 - Match Classification"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "90b634a7",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Predict matches and convert to dataframe\n",
    "\n",
    "results = linker.predict(threshold_match_probability=0.99)\n",
    "pres = results.as_pandas_dataframe()\n",
    "pres = pres.rename(columns={\"Firstname_l\": \"Firstname_psc\", \n",
    "                            \"Lastname_l\": \"Lastname_psc\",\n",
    "                            \"Firstname_r\":\"Firstname_w\",\n",
    "                            \"Lastname_r\":\"Lastname_w\",\n",
    "                            \"company_number_l\":\"company_number\"})\n",
    "pres = pres[['match_weight','match_probability','Firstname_psc','Firstname_w', 'Lastname_psc','Lastname_w','company_number','unique_id_r']]\n",
    "pres"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "535255a6",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Select matches that aren't exact\n",
    "\n",
    "pres[(pres['Lastname_psc']!=pres['Lastname_w']) | (pres['Firstname_psc']!=pres['Firstname_w'])]"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "d7fc6fbd",
   "metadata": {},
   "source": [
    "# Chapter 7 Extension "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "b4e785c9",
   "metadata": {},
   "outputs": [],
   "source": [
    "df_pres = pres.groupby(['unique_id_r'], sort=False).agg(lambda x: list(set(x))).reset_index()\n",
    "df_pres[(df_pres['company_number'].apply(len)>2) & df_pres['Lastname_psc'].apply(len)==1]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "6d85e1d4",
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "handsonentityresolution",
   "language": "python",
   "name": "handsonentityresolution"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.10.7"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}
